#
#
# Table_S1.R.
#
# Producing table S1 from the
# Glassdoor dataset (graph)
#
# Jeffrey Tu, j4tu@ucsd.edu
#

# rm(list = ls())
# setwd("SET WD HERE")

# Libraries 
library(dplyr)
library(data.table)
library(tidytext)
library(stringr)
library(stargazer)

# Set to TRUE to rebuild cleanedData.csv from the full review data set
# (computationally intensive); FALSE loads the pre-built file instead.
rebuild_from_raw <- FALSE

if (rebuild_from_raw) {
  # Load the raw reviews, keep only the columns used below (renamed for
  # clarity), and convert both ratings to numeric values.
  glassdoordata <- fread("all_reviews.csv") %>%
    select(
      headline = title,
      overall_rating = rating,
      comp_benefits = `Compensation and Benefits`
    ) %>%
    mutate(
      overall_rating = as.numeric(overall_rating),
      comp_benefits = as.numeric(comp_benefits)
    )

  ## String detection
  # Each name is the indicator column to create; each value is the pattern
  # searched for (case-insensitively) in the review headline. Keeping both in
  # one named vector prevents the patterns and column names from drifting apart.
  keyword_flags <- c(
    bDetectHead = "burea",
    lhDetectHead = "long hours",
    lpDetectHead = "low pay",
    cDetectHead = "conflict",
    sDetectHead = "stress"
  )

  # Create one 0/1 indicator column per keyword
  for (flag in names(keyword_flags)) {
    glassdoordata[[flag]] <- as.integer(
      grepl(keyword_flags[[flag]], glassdoordata$headline, ignore.case = TRUE)
    )
  }

  # Save the data frame so it can be processed by graphs.R and reloaded by the
  # else branch below. write.csv also writes the row names as an unnamed first
  # column, which fread reads back as `V1`.
  write.csv(glassdoordata, "cleanedData.csv")

} else {
  # Load in the cleaned data, distributed with the replication archive.
  # Drop the row-name column `V1` when present; any_of() avoids an error if
  # the file was saved without row names.
  glassdoordata <- fread("cleanedData.csv") %>% select(-any_of("V1"))
}


# Table of summary statistics

# Variables to be included, in table order. Names are the columns of
# `glassdoordata`; values are the row labels shown in the table. Keeping both
# in one named vector guarantees each label stays attached to its column.
variable_labels <- c(
  bDetectHead = "Bureaucracy mentioned in headline",
  comp_benefits = "Numeric rating of compensation and benefits",
  lhDetectHead = "Long hours mentioned in headline",
  lpDetectHead = "Low pay mentioned in headline",
  cDetectHead = "Conflict mentioned in headline",
  sDetectHead = "Stress mentioned in headline",
  overall_rating = "Overall Rating of Employer"
)
variables <- names(variable_labels)

# Fail early with a readable message if an expected column is absent
missing_vars <- setdiff(variables, names(glassdoordata))
if (length(missing_vars) > 0) {
  stop(
    "Columns missing from glassdoordata: ",
    paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}

# Calculate the mean and standard deviation of every variable, ignoring NAs.
# The result has one row per variable, in the order of `variables`.
# vapply() (rather than sapply()) guarantees one numeric value per column.
summary_stats <- glassdoordata[, .(
  Mean = vapply(
    .SD, function(x) as.numeric(mean(x, na.rm = TRUE)), numeric(1)
  ),
  SD = vapply(
    .SD, function(x) as.numeric(sd(x, na.rm = TRUE)), numeric(1)
  )
), .SDcols = variables]

# Attach the human-readable row labels (added as the last column, after
# Mean and SD), then convert to a plain data frame for stargazer
summary_stats[, Variable := unname(variable_labels[variables])]
summary_stats <- as.data.frame(summary_stats)

# Write the table as-is (summary = FALSE prints the data frame itself rather
# than summarising it) to an HTML file using stargazer
stargazer(summary_stats, type = "html", summary = FALSE, rownames = FALSE,
          digits = 2,
          out = "Table_S1.html")
